from time import time
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib import offsetbox
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import manifold, datasets, decomposition, random_projection
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from utility import *
from tsne import tsne as tsne_impl
from word2vec_sample import get_sample_word_vectors
Visualization can be very helpful in studying data, but it can only be done directly for low-dimensional data (fewer than 4 dimensions).
t-SNE is one of many techniques that try to solve the problem of visualizing high dimensional data.
The visualization produced by t-SNE can reveal the structure of the data, and its results have been shown to compare favorably with other techniques.
Original paper: "Visualizing Data using t-SNE" by Laurens van der Maaten and Geoffrey Hinton (JMLR, 2008).
For demonstration purpose, MNIST Dataset will be used (http://yann.lecun.com/exdb/mnist)
Please note that, t-SNE is an unsupervised machine learning method. The label is only used for color in visualizations.
# Load the 8x8 handwritten-digits dataset bundled with scikit-learn.
digits = datasets.load_digits()
X, y = digits.data, digits.target
n_samples, n_features = X.shape
print('X shape: (%s, %s)' % (n_samples, n_features))
# Tile the first 100 digits into one 100x100 image: each 8x8 digit sits
# inside a 10x10 cell with a 1-pixel offset, leaving a thin border between cells.
n_img_per_row = 10
img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
for row in range(n_img_per_row):
    top = 10 * row + 1
    for col in range(n_img_per_row):
        left = 10 * col + 1
        digit = X[row * n_img_per_row + col].reshape((8, 8))
        img[top:top + 8, left:left + 8] = digit
plt.imshow(img, cmap=plt.cm.binary)
plt.xticks([])
plt.yticks([])
plt.title('A selection from the 64-dimensional digits dataset')
Each data point is an image with 8x8 pixels, so we can treat MNIST as a 64-dimensional dataset.
Below we try to randomly project the dataset into 2 dimensional space.
# Baseline: project the 64-d digits onto 2 random sparse directions.
sparse_rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
X_projected = sparse_rp.fit_transform(X)
plot_embedding_MNIST(X_projected, y, digits, "Random Projection of the digits")
Not much information can be drawn from the visual.
Now let's see how t-SNE performs on visualizing MNIST.
t-SNE is a non-parametric method that visualizes high-dimensional data as a two- or three-dimensional map.
Based on Euclidean distance, t-SNE measures the similarity between data points using conditional probabilities:
t-SNE finds the projection Y (low dimension) of X (high dimension) by doing Gradient Descent:
Looking at the cost function:
We can see that t-SNE aims to retain the local structure (neighborhood distances) of similar points, while paying little attention to the global structure.
print("Computing t-SNE embedding")
n_neighbors = 30
t0 = time()
X_tsne_org, steps = tsne_impl(X, no_dims=2, perplexity=n_neighbors, return_steps=True, max_iter=1000)
tsne_time_org = time() - t0
tsne_plot_title_org = "t-SNE embedding of the digits (time %.2fs)" % (tsne_time_org)
print("Computing t-SNE embedding done (time %.2fs)" % (tsne_time_org))
# Build a long-format frame of the intermediate embeddings so the optimization
# can be replayed as an animation; each recorded snapshot contributes len(X) rows.
steps_data = np.array(steps)
n_snapshots = len(steps) // len(X)  # np.int was removed in NumPy 1.24; use plain int math
steps_data = pd.DataFrame(data={'step': steps_data[:, 0],
                                'X1': steps_data[:, 1],
                                'X2': steps_data[:, 2],
                                'y': np.array([y] * n_snapshots).flatten()})
px.scatter(steps_data, x="X1", y="X2", animation_frame="step", color="y", hover_name="y",
           # fix the axes to the final embedding's extent so every frame shares one scale
           # (the inner np.max was redundant: the arguments are already 1-D columns)
           range_x=[np.min(X_tsne_org[:, 0]), np.max(X_tsne_org[:, 0])],
           range_y=[np.min(X_tsne_org[:, 1]), np.max(X_tsne_org[:, 1])])
Sklearn package also provide out-of-the-box implementation of t-SNE
# Run scikit-learn's exact-method t-SNE and compare it side by side with the
# from-scratch implementation above.
n_neighbors = 30
tsne = manifold.TSNE(n_components=2, init='random', perplexity=n_neighbors, method='exact', random_state=0)
t0 = time()
X_tsne = tsne.fit_transform(X)
tsne_time = time() - t0
tsne_plot_title = "t-SNE embedding of the digits (time %.2fs)" % (tsne_time)
print("Computing t-SNE embedding done (time %.2fs)" % (tsne_time))
fig, axes = plt.subplots(1, 2, figsize=(15,7))
for ax, (embedding, title) in zip(axes, [(X_tsne_org, tsne_plot_title_org),
                                         (X_tsne, tsne_plot_title + " (sklearn)")]):
    plot_embedding_MNIST(embedding, y, digits, title, ax)
So by using t-SNE, we can visualize a high-dimensional (64) dataset as a low-dimensional (2) map, which a human can interpret.
And by checking the map, we can see the dataset has some structure in form of clusters.
In this section, we will compare t-SNE with three other techniques (Principal Component Analysis, Spectral Embedding, and Isomap).
We will see that the visualization of t-SNE is better than all of them in terms of revealing the structure of the data.
# Linear baseline: project onto the first two principal components.
pca = decomposition.PCA(n_components=2)
t0 = time()
X_pca = pca.fit_transform(X)
pca_time = time() - t0
pca_plot_title = "Principal Components projection of the digits (time %.2fs)" % (pca_time)
print("Computing PCA projection done (time %.2fs)" % (pca_time))
# Spectral embedding (Laplacian eigenmaps) of the digits.
embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, eigen_solver="arpack")
t0 = time()
X_se = embedder.fit_transform(X)
se_time = time() - t0
se_plot_title = "Spectral embedding of the digits (time %.2fs)" % (se_time)
# Bug fix: the completion message previously reported pca_time instead of se_time.
print("Computing Spectral embedding done (time %.2fs)" % (se_time))
n_neighbors = 30
# Pass n_neighbors by keyword: positional use was deprecated in scikit-learn 1.0
# and removed in 1.2, where Isomap(30, n_components=2) raises a TypeError.
isom = manifold.Isomap(n_neighbors=n_neighbors, n_components=2)
t0 = time()
X_isom = isom.fit_transform(X)
isom_time = time() - t0
isom_plot_title = "Isomap embedding of the digits (time %.2fs)" % (isom_time)
print("Computing Isomap embedding done (time %.2fs)" % (isom_time))
# Show the four embeddings (t-SNE, PCA, Spectral, Isomap) in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15,15))
embeddings = [(X_tsne, tsne_plot_title), (X_pca, pca_plot_title),
              (X_se, se_plot_title), (X_isom, isom_plot_title)]
for ax, (embedding, title) in zip(axes.ravel(), embeddings):
    plot_embedding_MNIST(embedding, y, digits, title, ax)
One problem of t-SNE is scalability.
Each step of the gradient descent includes computing the pairwise conditional probabilities of the data points (O($N^2$)).
In a more recent paper by Laurens van der Maaten: Accelerating t-SNE
He proposed a solution to this problem using an approximation based on the Barnes-Hut algorithm. This approach reduces the cost of a gradient descent step significantly (O($N \log N$)).
# Barnes-Hut t-SNE: approximate gradient, O(N log N) per step instead of O(N^2).
print("Computing faster t-SNE embedding")
n_neighbors = 30
tsne = manifold.TSNE(n_components=2, init='pca', perplexity=n_neighbors, method='barnes_hut', random_state=0)
t0 = time()
X_tsne_fast = tsne.fit_transform(X)
tsne_fast_time = time() - t0
tsne_fast_plot_title = "Faster t-SNE embedding of the digits (time %.2fs)" % (tsne_fast_time)
fig, axes = plt.subplots(1, 2, figsize=(15,7))
for ax, (embedding, title) in zip(axes, [(X_tsne, tsne_plot_title),
                                         (X_tsne_fast, tsne_fast_plot_title)]):
    plot_embedding_MNIST(embedding, y, digits, title, ax)
There are two parameters:
There is no instruction on how to choose these parameters for any specific dataset.
Furthermore, parts of t-SNE are randomized (e.g. the initialization), so it is non-deterministic.
It is recommended to plot the t-SNE map with several parameter combinations, and choose the one that looks reasonable.
# Sweep perplexity x iteration count and plot every resulting map for a
# visual comparison of parameter sensitivity.
perplexities = [2, 5, 30, 50, 100]
n_iters = [250, 1000, 5000]
X_tsne_list = []
tsne_title_list = []
for perp in perplexities:
    for iters in n_iters:
        tsne_temp = manifold.TSNE(n_components=2, init='pca', n_iter=iters, perplexity=perp, method='barnes_hut', random_state=0)
        t0 = time()
        X_tsne_list.append(tsne_temp.fit_transform(X))
        elapsed = time() - t0
        tsne_title_list.append("(iter %s - perplexity %s) (time %.2fs)" % (iters, perp, elapsed))
        print('Computing t-SNE embedding done (iter %s - perplexity %s) (time %.2fs)' % (iters, perp, elapsed))
# 5 rows (perplexities) x 3 columns (iteration counts); ravel matches the
# row-major order in which the results were appended above.
fig, axes = plt.subplots(5, 3, figsize=(15,25))
for idx, ax in enumerate(axes.ravel()):
    plot_embedding_MNIST(X_tsne_list[idx], y, digits, tsne_title_list[idx], ax)
The aim of t-SNE is solely to visualize high-dimensional data in a low-dimensional map.
The authors do not recommend using t-SNE as a general dimensionality reduction tool for target dimensions above 3.
# Three-component t-SNE, shown as an interactive 3-D scatter plot.
tsne_3d = manifold.TSNE(n_components=3, init='pca', perplexity=n_neighbors, method='barnes_hut', random_state=0)
X_tsne_3d = tsne_3d.fit_transform(X)
df_3d = pd.DataFrame(data={'X1': X_tsne_3d[:, 0], 'X2': X_tsne_3d[:, 1], 'X3': X_tsne_3d[:, 2], 'y': y})
fig = px.scatter_3d(df_3d, x='X1', y='X2', z='X3', color='y')
fig.show()
One further note: since t-SNE uses conditional probabilities to measure similarity, distances beyond a certain threshold no longer hold any meaning.
In other words, we can only say that data points close to each other are similar; the distance between clusters does not mean that one cluster is more similar to another than to any other.
We want to stress again that t-SNE purpose is only for visualization.
To demonstrate this, we will try to apply t-SNE as a preprocessing tool before clustering or prediction.
# t-SNE as input to clustering: embed first, then run k-means on the 2-d map
# with two different cluster counts.
perplexity = 30
n_iter = 1000
tsne = manifold.TSNE(n_components=2, init='pca', perplexity=perplexity, n_iter=n_iter, method='barnes_hut', random_state=0)
X_tsne = tsne.fit_transform(X)
y_kmeans_10 = KMeans(n_clusters=10, random_state=0).fit(X_tsne).labels_
y_kmeans_12 = KMeans(n_clusters=12, random_state=0).fit(X_tsne).labels_
fig, axes = plt.subplots(2, 2, figsize=(15,15))
panels = [(np.zeros(len(X_tsne)), 't-SNE result with no label'),
          (y, 't-SNE result with label'),
          (y_kmeans_10, 'Clustering using Kmeans (n clusters = 10)'),
          (y_kmeans_12, 'Clustering using Kmeans (n clusters = 12)')]
for ax, (labels, title) in zip(axes.ravel(), panels):
    plot_embedding_MNIST(X_tsne, labels, digits, title, ax)
We can see some incorrect clusters.
The reason is that clustering algorithms like k-means measure similarity by distance, while t-SNE does not preserve distances when reducing dimensionality.
# t-SNE as input to prediction.
# NOTE: TSNE has no transform() for unseen samples, so train and test sets are
# embedded by two independent fit_transform runs — their coordinate systems
# are unrelated, which is exactly the failure mode this section demonstrates.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
tsne = manifold.TSNE(n_components=2, init='pca', perplexity=n_neighbors, method='barnes_hut', random_state=0)
X_tsne_train = tsne.fit_transform(X_train)
X_tsne_test = tsne.fit_transform(X_test)

def _fit_predict(model, features_train, features_test):
    # Fit the model on the training labels, return predictions for the test features.
    return model.fit(features_train, y_train).predict(features_test)

y_dt = _fit_predict(DecisionTreeClassifier(), X_train, X_test)
y_dt_tsne = _fit_predict(DecisionTreeClassifier(), X_tsne_train, X_tsne_test)
y_knn = _fit_predict(KNeighborsClassifier(), X_train, X_test)
y_knn_tsne = _fit_predict(KNeighborsClassifier(), X_tsne_train, X_tsne_test)
y_mlp = _fit_predict(MLPClassifier(random_state=0), X_train, X_test)
y_mlp_tsne = _fit_predict(MLPClassifier(random_state=0), X_tsne_train, X_tsne_test)
scores = []
def calculate_scores(y_true, y_pred, method_name):
    """Return [name, accuracy, precision, recall, f1] (micro-averaged) for one model."""
    row = [method_name, accuracy_score(y_true, y_pred)]
    for metric in (precision_score, recall_score, f1_score):
        row.append(metric(y_true, y_pred, average='micro'))
    return row
# Tabulate every model's metrics, with and without t-SNE preprocessing.
for name, predictions in [('Decision Tree', y_dt),
                          ('Decision Tree with t-SNE', y_dt_tsne),
                          ('KNNs', y_knn),
                          ('KNNs with t-SNE', y_knn_tsne),
                          ('Neural Network', y_mlp),
                          ('Neural Network with t-SNE', y_mlp_tsne)]:
    scores.append(calculate_scores(y_test, predictions, name))
scores = pd.DataFrame(data=scores)
scores.columns = ['Method', 'Accuracy', 'Precision', 'Recall', 'F1']
print(scores)
The reason for this significant drop in scores is that t-SNE cannot transform unseen data with the mapping learned from the training set: each call to fit_transform learns a new, independent embedding.
So the structure of the train and test sets won't be the same in the low-dimensional space.
In the example using the MNIST dataset, t-SNE can visualize the data with fairly clear clusters. This indicates that the classes are separable. As a result, we can obtain high performance on the classification task even with simple models, without any data preprocessing.
But what about a more complex dataset with no clear separation between classes?
To illustrate this point, Urban Land Cover Data Set will be used.
# Load the Urban Land Cover dataset; column 0 holds the class label,
# the remaining columns are the features.
ulc_data_train = pd.read_csv('ULC_Dataset/training.csv')
ulc_data_test = pd.read_csv('ULC_Dataset/testing.csv')
print('ULC Dataset (n_samples, n_features):', ulc_data_train.shape)
X_ulc_train, y_ulc_train = ulc_data_train.iloc[:, 1:], ulc_data_train.iloc[:, 0]
X_ulc_test, y_ulc_test = ulc_data_test.iloc[:, 1:], ulc_data_test.iloc[:, 0]
# Embed the raw (unscaled) ULC features with t-SNE and plot by class.
perplexity = 20
n_iter = 2000
tsne_ulc = manifold.TSNE(n_components=2, init='pca', perplexity=perplexity, n_iter=n_iter, method='barnes_hut', random_state=0)
X_ulc_tsne = tsne_ulc.fit_transform(X_ulc_train)
ulc_df = pd.DataFrame(data={'X1': X_ulc_tsne[:, 0], 'X2': X_ulc_tsne[:, 1], 'y': y_ulc_train})
fig = px.scatter(ulc_df, x='X1', y='X2', color='y')
fig.show()
t-SNE couldn't clearly separate the classes.
Below is the result when we use the data, without preprocessing, as input to classification models.
# Classify the raw ULC data (no preprocessing) and tabulate the metrics.
y_ulc_dt = DecisionTreeClassifier().fit(X_ulc_train, y_ulc_train).predict(X_ulc_test)
y_ulc_knn = KNeighborsClassifier().fit(X_ulc_train, y_ulc_train).predict(X_ulc_test)
y_ulc_mlp = MLPClassifier(random_state=0).fit(X_ulc_train, y_ulc_train).predict(X_ulc_test)
scores_ulc = []
for name, predictions in [('Decision Tree', y_ulc_dt),
                          ('KNNs', y_ulc_knn),
                          ('Neural Network', y_ulc_mlp)]:
    scores_ulc.append(calculate_scores(y_ulc_test, predictions, name))
scores_ulc = pd.DataFrame(data=scores_ulc)
scores_ulc.columns = ['Method', 'Accuracy', 'Precision', 'Recall', 'F1']
print(scores_ulc)
KNNs and the Neural Network perform really poorly (less than 0.5 accuracy).
There are three possible reasons:
So, what if we preprocess our data before feeding it into our models?
Below, the dataset goes through two preprocessing steps:
# Preprocess: standardize the features, then reduce to 10 PCA components.
pca_ulc = decomposition.PCA(n_components=10)
scaler_ulc = StandardScaler()
X_ulc_train_pca = pca_ulc.fit_transform(scaler_ulc.fit_transform(X_ulc_train))
# The test set is transformed only — scaling/PCA statistics come from training data.
X_ulc_test_pca = pca_ulc.transform(scaler_ulc.transform(X_ulc_test))
perplexity = 30
n_iter = 1000
tsne_ulc = manifold.TSNE(n_components=2, init='pca', perplexity=perplexity, n_iter=n_iter, method='barnes_hut', random_state=0)
X_ulc_tsne_scaled = tsne_ulc.fit_transform(X_ulc_train_pca)
scaled_df = pd.DataFrame(data={'X1': X_ulc_tsne_scaled[:, 0], 'X2': X_ulc_tsne_scaled[:, 1], 'y': y_ulc_train})
fig = px.scatter(scaled_df, x='X1', y='X2', color='y')
fig.show()
Now t-SNE can separate the classes better than before.
Below is the result when we use the data, with preprocessing, as input to classification models.
# Re-run the same classifiers on the scaled + PCA-reduced features.
y_ulc_dt = DecisionTreeClassifier(random_state=0).fit(X_ulc_train_pca, y_ulc_train).predict(X_ulc_test_pca)
y_ulc_knn = KNeighborsClassifier().fit(X_ulc_train_pca, y_ulc_train).predict(X_ulc_test_pca)
y_ulc_mlp = MLPClassifier(random_state=0).fit(X_ulc_train_pca, y_ulc_train).predict(X_ulc_test_pca)
scores_ulc = []
for name, predictions in [('Decision Tree', y_ulc_dt),
                          ('KNNs', y_ulc_knn),
                          ('Neural Network', y_ulc_mlp)]:
    scores_ulc.append(calculate_scores(y_ulc_test, predictions, name))
scores_ulc = pd.DataFrame(data=scores_ulc)
scores_ulc.columns = ['Method', 'Accuracy', 'Precision', 'Recall', 'F1']
print(scores_ulc)
The performance of KNNs and the Neural Network improves significantly.
We will consider the embedding problem for language.
The goal is to convert words into some kind of data that a machine can perform on.
Word2Vec is a method to solve that problem: it transforms words into corresponding vectors.
After applying Word2Vec, we can use t-SNE to visualize whether our embedding process is good.
# Get some sample words and their vectors
tokens_with_vecs = np.array(get_sample_word_vectors(500))
token_vecs = [np.array(arr) for arr in tokens_with_vecs[:, 1]]
tokens = tokens_with_vecs[:, 0]
# Perform t-SNE on the word vectors
perplexity = 30
n_iter = 1000
tsne_w2v = manifold.TSNE(n_components=2, init='pca', perplexity=perplexity, n_iter=n_iter, method='barnes_hut', random_state=0)
X_w2v_tsne = tsne_w2v.fit_transform(token_vecs)
# Plot the t-SNE result, labeling each point with its word.
# Fix: dropped log_x=True (t-SNE coordinates are signed, so a log axis is
# invalid) and size_max=60 (a no-op without a `size` column) — both were
# vestiges of a copied plotly example.
fig = px.scatter(pd.DataFrame(data={'X1': X_w2v_tsne[:, 0], 'X2': X_w2v_tsne[:, 1], 'y': tokens}),
                 x="X1", y="X2", text="y")
fig.update_traces(textposition='top center')
fig.update_layout(
    height=800,
    title_text='Word2Vec for words in Alice in Wonderland'
)
fig.show()
We can see that t-SNE cannot separate a lot of the words.
But checking some clusters, we can see some positive result:
t-SNE is a powerful technique for visualizing high dimensional data. It can reveal some structure in the data which is really helpful for the analyzing process.
Some key take-away: